{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/fake-news/indonesian/250%20news%20with%20valid%20hoax%20label.csv\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/fake-news/indonesian/600%20news%20with%20valid%20hoax%20label.csv\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/fake-news/indonesian/facts.csv\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news/fake-news/indonesian/hoax.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/husein/Malaya-Dataset/fake-news/negative/clean-gossipcop_real2.json',\n",
       " '/home/husein/Malaya-Dataset/fake-news/negative/clean-gossipcop_real3.json',\n",
       " '/home/husein/Malaya-Dataset/fake-news/negative/1.json',\n",
       " '/home/husein/Malaya-Dataset/fake-news/negative/clean-gossipcop_real1.json',\n",
       " '/home/husein/Malaya-Dataset/fake-news/negative/clean-politifact_real.json',\n",
       " '/home/husein/Malaya-Dataset/fake-news/negative/2.json']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "negatives = glob.glob('/home/husein/Malaya-Dataset/fake-news/negative/*.json')\n",
    "positives = glob.glob('/home/husein/Malaya-Dataset/fake-news/positive/*.json')\n",
    "negatives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts, labels = [], []\n",
    "for negative in negatives:\n",
    "    with open(negative) as fopen:\n",
    "        x = json.load(fopen)\n",
    "    texts.extend(x)\n",
    "    labels.extend([1] * len(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for positive in positives:\n",
    "    with open(positive) as fopen:\n",
    "        x = json.load(fopen)\n",
    "    texts.extend(x)\n",
    "    labels.extend([0] * len(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0, 1]), array([15191, 26832]))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(labels, return_counts = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>News</th>\n",
       "      <th>Label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Produk Kod E Mengandungi Lemak Babi. Awas, jan...</td>\n",
       "      <td>Fake</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Jabatan Kemajuan Islam Malaysia memperjelaskan...</td>\n",
       "      <td>Real</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Roti Massimo Mengandungi DNA Babi. Roti produk...</td>\n",
       "      <td>Fake</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Jabatan Kemajuan Islam Malaysia (JAKIM) melalu...</td>\n",
       "      <td>Real</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Tiada Pembayaran Pencen Selepas Persaraan 2021...</td>\n",
       "      <td>Fake</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                News Label\n",
       "0  Produk Kod E Mengandungi Lemak Babi. Awas, jan...  Fake\n",
       "1  Jabatan Kemajuan Islam Malaysia memperjelaskan...  Real\n",
       "2  Roti Massimo Mengandungi DNA Babi. Roti produk...  Fake\n",
       "3  Jabatan Kemajuan Islam Malaysia (JAKIM) melalu...  Real\n",
       "4  Tiada Pembayaran Pencen Selepas Persaraan 2021...  Fake"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('Malaya-Dataset/fake-news/malaysia-scraping-syazanihussin.csv').dropna()\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((239, 2), (232, 2))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_fake = df.loc[df['Label'] == 'Fake']\n",
    "df_real = df.loc[df['Label'] == 'Real']\n",
    "df_fake.shape, df_real.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_250 = pd.read_csv('250 news with valid hoax label.csv', sep = ';', encoding = \"ISO-8859-1\")\n",
    "df_600 = pd.read_csv('600 news with valid hoax label.csv', sep = ';', encoding = \"ISO-8859-1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "facts = pd.read_csv('facts.csv')\n",
    "hoax = pd.read_csv('hoax.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "fact_isi = facts['Isi'].tolist()\n",
    "\n",
    "hoax_judul = [i for i in hoax['Judul'].tolist() if len(i) > 25]\n",
    "hoax_isi = hoax['Isi'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "mapping = {'Valid': 1, 'Hoax': 0}\n",
    "\n",
    "labels.extend(df_250['tagging'].map(mapping).tolist())\n",
    "labels.extend(df_600['tagging'].map(mapping).tolist())\n",
    "texts.extend(df_250['berita'].tolist())\n",
    "texts.extend(df_600['berita'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts.extend(fact_isi)\n",
    "labels.extend([1] * len(fact_isi))\n",
    "\n",
    "texts.extend(hoax_judul)\n",
    "labels.extend([0] * len(hoax_judul))\n",
    "\n",
    "texts.extend(hoax_isi)\n",
    "labels.extend([0] * len(hoax_isi))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(34429, 8608)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(texts, labels, test_size = 0.2)\n",
    "len(train_X), len(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X.extend(df_fake['News'].tolist())\n",
    "train_Y.extend([0] * len(df_fake['News'].tolist()))\n",
    "\n",
    "train_X.extend(df_real['News'].tolist())\n",
    "train_Y.extend([1] * len(df_real['News'].tolist()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "\n",
    "alphabets = '([A-Za-z])'\n",
    "prefixes = (\n",
    "    '(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt|Puan|puan|Tuan|tuan|sir|Sir)[.]'\n",
    ")\n",
    "suffixes = '(Inc|Ltd|Jr|Sr|Co)'\n",
    "starters = '(Mr|Mrs|Ms|Dr|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever|Dia|Mereka|Tetapi|Kita|Itu|Ini|Dan|Kami)'\n",
    "acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)'\n",
    "websites = '[.](com|net|org|io|gov|me|edu|my)'\n",
    "another_websites = '(www|http|https)[.]'\n",
    "digits = '([0-9])'\n",
    "\n",
    "def split_into_sentences(text):\n",
    "    text = unidecode(text)\n",
    "    text = ' ' + text + '  '\n",
    "    text = text.replace('\\n', ' ')\n",
    "    text = re.sub(prefixes, '\\\\1<prd>', text)\n",
    "    text = re.sub(websites, '<prd>\\\\1', text)\n",
    "    text = re.sub(another_websites, '\\\\1<prd>', text)\n",
    "    if '...' in text:\n",
    "        text = text.replace('...', '<prd><prd><prd>')\n",
    "    if 'Ph.D' in text:\n",
    "        text = text.replace('Ph.D.', 'Ph<prd>D<prd>')\n",
    "    text = re.sub('\\s' + alphabets + '[.] ', ' \\\\1<prd> ', text)\n",
    "    text = re.sub(acronyms + ' ' + starters, '\\\\1<stop> \\\\2', text)\n",
    "    text = re.sub(\n",
    "        alphabets + '[.]' + alphabets + '[.]' + alphabets + '[.]',\n",
    "        '\\\\1<prd>\\\\2<prd>\\\\3<prd>',\n",
    "        text,\n",
    "    )\n",
    "    text = re.sub(\n",
    "        alphabets + '[.]' + alphabets + '[.]', '\\\\1<prd>\\\\2<prd>', text\n",
    "    )\n",
    "    text = re.sub(' ' + suffixes + '[.] ' + starters, ' \\\\1<stop> \\\\2', text)\n",
    "    text = re.sub(' ' + suffixes + '[.]', ' \\\\1<prd>', text)\n",
    "    text = re.sub(' ' + alphabets + '[.]', ' \\\\1<prd>', text)\n",
    "    text = re.sub(digits + '[.]' + digits, '\\\\1<prd>\\\\2', text)\n",
    "    if '”' in text:\n",
    "        text = text.replace('.”', '”.')\n",
    "    if '\"' in text:\n",
    "        text = text.replace('.\"', '\".')\n",
    "    if '!' in text:\n",
    "        text = text.replace('!\"', '\"!')\n",
    "    if '?' in text:\n",
    "        text = text.replace('?\"', '\"?')\n",
    "    text = text.replace('.', '.<stop>')\n",
    "    text = text.replace('?', '?<stop>')\n",
    "    text = text.replace('!', '!<stop>')\n",
    "    text = text.replace('<prd>', '.')\n",
    "    sentences = text.split('<stop>')\n",
    "    sentences = sentences[:-1]\n",
    "    sentences = [s.strip() for s in sentences if len(s) > 10]\n",
    "    return sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/tokenization.py:240: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.\n",
      "\n",
      "INFO:tensorflow:loading sentence piece model\n"
     ]
    }
   ],
   "source": [
    "from albert import tokenization\n",
    "tokenizer = tokenization.FullTokenizer(\n",
    "      vocab_file='albert/albert-base-2020-04-10/sp10m.cased.v10.vocab', do_lower_case=False,\n",
    "      spm_model_file='albert/albert-base-2020-04-10/sp10m.cased.v10.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def augment(string, label, maxlen = 400):\n",
    "    results, sentences, tokenized = [], '', []\n",
    "    splitted = split_into_sentences(string)\n",
    "    for no, s in enumerate(splitted):\n",
    "        t = tokenizer.tokenize(s)\n",
    "        if len(tokenized) + len(t) >= maxlen:\n",
    "            results.append(sentences.strip())\n",
    "            tokenized = tokenizer.tokenize(splitted[no-1])\n",
    "            sentences = splitted[no-1]\n",
    "\n",
    "        sentences += s + ' '\n",
    "        tokenized.extend(t)\n",
    "    if len(sentences):\n",
    "        results.append(sentences.strip())\n",
    "    return results, [label] * len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(['akan membuat keputusan untuk menarik balik sepuluh kali apa yang akan diambil, di luar wikileaks, untuk membawa mesin clinton (tunai) ke bawah akan menang bukit dan kemudian mengumumkan wwiii terhadap russia iran syria \"paksi jahat\" akan timur tengah benar-benar meletup akan pivot untuk asia benar-benar meletup akan cina akan memerintah dunia menjelang 2025 di tengah-tengah begitu banyak pecahan kegilaan realiti geopolitik yang diwarnakan dengan rapi terhadap runtuhan kami, godaan tidak dapat ditarik kembali kepada tuan jean baudrillard yang terlambat, hebat, dekonstruksi. semasa mod post 1980-an ia menjadi pinggang menjadi inti america nya, yang asalnya diterbitkan dalam perancis pada tahun 1986, masih harus dibaca hari ini sebagai instagram budaya geologi metafisis definitif exceptionalistan. menjelang akhir tahun 1990-an, pada akhir alaf, dua tahun sebelum 9 11, peristiwa \"sebelum dan selepas\" yang seminalis telah menegaskan bagaimana kita hidup dalam labirin pasaran gelap. Kini, ia adalah pasaran paroxysm hitam. Orang ramai global tertakluk kepada pasaran kerja hitam seperti dalam deregulasi pasaran rasmi pasaran gelap pengangguran pasaran gelap spekulasi kewangan pasaran gelap kesengsaraan dan kemiskinan pasaran hitam seks (seperti dalam pelacuran) pasaran gelap maklumat (seperti dalam pengintipan dan peperangan bayang-bayang) pasaran hitam senjata dan juga pasaran pemikiran hitam. jalan di luar abad ke-20, pada tahun 2010, apa yang dipuji barat sebagai \"demokrasi liberal\" sebenarnya dtoliperal neoliberal telah hampir menyerap setiap perbezaan ideologi, sambil meninggalkan sebilangan perbezaan yang terapung dalam beberapa kesan trompe l\\'oeil. apa yang tersisa adalah suatu keadaan yang meluas dan beracun yang melarang larangan pemikiran apa-apa pemikiran kritikal, yang tidak mempunyai cara untuk meluahkan dirinya selain daripada menjadi rahsia (atau mencari niche internet yang tepat).',\n",
       "  'apa yang tersisa adalah suatu keadaan yang meluas dan beracun yang melarang larangan pemikiran apa-apa pemikiran kritikal, yang tidak mempunyai cara untuk meluahkan dirinya selain daripada menjadi rahsia (atau mencari niche internet yang tepat).baudrillard sudah tahu bahawa konsep \"mengubah\" yang dibunuh oleh kegembiraan tidak wujud di pasaran rasmi. jadi pasaran hitam yang \"berubah\" juga muncul, bersama dengan peniaga-peniaga yang, misalnya, alam perkauman, nativisme dan lain-lain bentuk pengecualian. baudrillard telah mengenal pasti bagaimana \"seludup mengubah\", yang dinyatakan oleh sekte dan setiap bentuk nasionalisme (kini, berfikir tentang spektrum antara jihadisme dan parti politik sayap kanan yang melampau) terikat untuk menjadi lebih ganas dalam masyarakat yang sangat tidak bertoleransi, taksub dengan regimentasi, dan sama sekali homogenisasi. terdapat banyak kegembiraan yang terbentuk dalam hidup yang hidup dalam koktail chimera yang membingungkan budaya, tanda-tanda, perbezaan dan \"nilai\" tetapi kemudian datang gandingan pemikiran dengan tepatnya replika kecerdasan buatan, bermain dengan garis penentuan antara manusia dan bukan manusia dalam domain pemikiran. Hasilnya, yang dipratonton oleh baudrillard, adalah rembesan masyarakat parapolitis dengan semacam mafia yang mengawal bentuk rahasia umum ini (pikir tuan finansial alam semesta). Kuasa tidak dapat melawan mafia ini dan itu, di atasnya, munafik, kerana mafia itu sendiri berasal dari kuasa. hasil akhirnya adalah apa yang sebenarnya penting hari ini, di mana sahaja, kebanyakannya cenderung berlaku di luar semua litar rasmi seperti dalam pasaran gelap sosial. adakah terdapat maklumat \"kebenaran\" baudrillard menunjukkan bagaimana ekonomi politik adalah mesin besar, menghasilkan nilai, menghasilkan tanda-tanda kekayaan, tetapi bukan kekayaan itu sendiri. sistem maklumat keseluruhan media yang masih diperintah oleh Amerika adalah peristiwa yang menghasilkan mesin besar sebagai tanda-tanda yang dapat ditukar nilai dalam pasaran universal ideologi, sistem bintang dan bencana.',\n",
       "  'sistem maklumat keseluruhan media yang masih diperintah oleh Amerika adalah peristiwa yang menghasilkan mesin besar sebagai tanda-tanda yang dapat ditukar nilai dalam pasaran universal ideologi, sistem bintang dan bencana.pengekstrakan maklumat ini berfungsi seperti dalam ekonomi yang menyalahgunakan bahan berkod, diuraikan terlebih dahulu, dan boleh dirunding dari segi model, seperti ekonomi melepaskan produk yang boleh dirunding dari segi harga dan nilai. kerana semua barangan, terima kasih kepada abstraksi nilai ini, boleh ditukar, maka setiap peristiwa (atau bukan peristiwa) juga boleh ditukar, semuanya menggantikan satu sama lain di pasaran budaya informasi. dan ini membawa kita ke mana kita hidup sekarang sejarah trans, dan politik trans yang peristiwa-peristiwa tidak benar-benar tidak berlaku, kerana mereka hilang dalam kekosongan maklumat (seperti ekonomi hilang dalam vakum spekulasi). oleh itu wawasan baudrillard yang sangat penting jika kita menganggap sejarah sebagai filem dan itulah yang sekarang maka \"kebenaran\" maklumat tidak lebih daripada sintesis pengeluaran pos, dubbing dan sub judul.'],\n",
       " [0, 0, 0])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "augment(train_X[0], train_Y[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 34900/34900 [01:39<00:00, 351.00it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "X_train, Y_train = [], []\n",
    "\n",
    "for i in tqdm(range(len(train_X))):\n",
    "    x, y = augment(train_X[i], train_Y[i])\n",
    "    X_train.extend(x)\n",
    "    Y_train.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(68887, 68887)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(X_train), len(Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8608/8608 [00:25<00:00, 332.17it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "X_test, Y_test = [], []\n",
    "\n",
    "for i in tqdm(range(len(test_X))):\n",
    "    x, y = augment(test_X[i], test_Y[i])\n",
    "    X_test.extend(x)\n",
    "    Y_test.extend(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, Y_train = shuffle(X_train, Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test, Y_test = shuffle(X_test, Y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('relevant-dataset.pkl', 'wb') as fopen:\n",
    "    pickle.dump({'train_X': X_train, 'train_Y': Y_train,\n",
    "                'test_X': X_test, 'test_Y': Y_test}, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
